# Set working directory
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

pacman::p_load(tidyverse, haven, nnet, texreg, lubridate, tidyquant, scales, corrplot, weights, jtools, ggthemes, thematic, REdaS, survey, ggeffects, ggstance, readxl)

# Load data
dat_pre <- read_csv("2021_Federal_Pre_Election.csv")
dat_post <- read_csv("2021_Federal_Post_Election.csv")

# Remove testing observations (from researchers)
dat_pre <- dat_pre %>% 
  filter(DistributionChannel != "preview") 

dat_post <- dat_post %>% 
  filter(DistributionChannel != "preview")

# Reformat dates and remove election day responses
dat_pre <- dat_pre %>% 
  mutate(StartDate = as.Date(StartDate, format = "%m/%d/%Y %H:%M"),
         EndDate = as.Date(EndDate, format = "%m/%d/%Y %H:%M")) %>% 
  filter(EndDate < "2021-09-21")

dat_post <- dat_post %>% 
  mutate(StartDate = as.Date(StartDate, format = "%m/%d/%Y %H:%M"),
         EndDate = as.Date(EndDate, format = "%m/%d/%Y %H:%M"))

# Remove duplicate responses
dat_pre <- dat_pre %>%
  group_by(PID) %>%
  filter(if(any(gc == 1, na.rm = T)) gc == 1 else EndDate == max(EndDate)) %>%
  filter(Progress == max(Progress)) %>%
  filter(`Duration (in seconds)` == max(`Duration (in seconds)`)) %>%
  ungroup() %>% 
  filter(`Duration (in seconds)` != 0)

dat_post <- dat_post %>%
  group_by(PID) %>%
  filter(if(any(gc == 1, na.rm = T)) gc == 1 else EndDate == max(EndDate)) %>%
  filter(Progress == max(Progress)) %>%
  filter(`Duration (in seconds)` == max(`Duration (in seconds)`)) %>%
  ungroup() %>% 
  filter(`Duration (in seconds)` != 0)

length(unique(dat_pre$PID)) # 10,070
length(unique(dat_post$PID)) # 3,723

# Did not answer any question (these were the first questions)
sum(is.na(dat_pre$citizen)) #105
sum(is.na(dat_post$gender)) #29

dat_pre <- dat_pre %>% 
  filter(!is.na(citizen)) # 9,965

dat_post <- dat_post %>% 
  filter(!is.na(gender)) # 3,694

# Not eligible
inelegible_pre <- dat_pre %>% 
  filter(noncitizen == 1 | under18==1) # 339
inelegible_post <- dat_post %>% 
  filter(noncitizen == 1 | under18==1) # 26

dat_pre <- dat_pre %>% 
  filter(!PID %in% inelegible_pre$PID) # 9626

dat_post <- dat_post %>% 
  filter(!PID %in% inelegible_post$PID) # 3668

## Excluded after quota-based questions
sum(is.na(dat_pre$postalcode) & (!is.na(dat_pre$province) | !is.na(dat_pre$age) | !is.na(dat_pre$gender)))
dat_pre_quota_excl <- dat_pre %>% 
  filter(is.na(dat_pre$postalcode) & (!is.na(dat_pre$province) | !is.na(dat_pre$age) | !is.na(dat_pre$gender)))
dat_pre <- dat_pre %>% 
  filter(!PID %in% dat_pre_quota_excl$PID)

# Incomplete answers
dat_pre_incomplete <- dat_pre %>% 
  filter(gc != 1 | is.na(gc))
dat_post_incomplete <- dat_post %>% 
  filter(gc != 1 | is.na(gc))

### Keep only completes

dat_pre <- dat_pre %>% 
  filter(gc == 1)

dat_post <- dat_post %>% 
  filter(gc == 1)

sum(dat_post$EndDate <= "2021-10-04")/sum(!is.na(dat_post$EndDate))

## Remove inattentive respondents

speeders_inattentive_pre <- dat_pre %>% 
  filter(`Duration (in seconds)` < (median(dat_pre$`Duration (in seconds)`, na.rm = T)/3) &
           issues_8 != 3)

speeders_inattentive_post <- dat_post %>% 
  filter(`Duration (in seconds)` < (median(dat_post$`Duration (in seconds)`, na.rm = T)/3) &
           misinfo_attitudes_6 != 3)

dat_pre <- dat_pre %>% 
  filter(!PID %in% speeders_inattentive_pre$PID)

dat_post <- dat_post %>% 
  filter(!PID %in% speeders_inattentive_post$PID)
 
# Calculate the weights
weighting <- function(dat){
  require(tidyverse)
  require(survey)
  
  # Prepare the data ---------------------------------------------------#
  dat <- dat %>% 
    dplyr::mutate(region1=NA,
                  region1 = ifelse(province == 9, "West", 
                                   ifelse(province == 10, "West",
                                          ifelse(province == 7, "West",
                                                 ifelse(province == 3, "Atlantic",
                                                        ifelse(province == 1, "Atlantic",
                                                               ifelse(province == 14, NA,
                                                                      ifelse(province == 4, "Atlantic",
                                                                             ifelse(province == 15, NA, 
                                                                                    ifelse(province == 6, "Ontario",
                                                                                           ifelse(province == 2, "Atlantic",
                                                                                                  ifelse(province == 5, "Quebec",
                                                                                                         ifelse(province == 8, "West",
                                                                                                                ifelse(province == 16, NA, region1))))))))))))),
                  female=NA, 
                  female = ifelse(gender == 2, 1, 
                                  ifelse(gender == 1, 0,
                                         ifelse(gender == 3, NA, female))),
                  age_cat=NA,
                  age_cat = ifelse(age <= 34, 1,
                                   ifelse(age >= 35 & age <= 54, 2,
                                          ifelse(age >= 55, 3, age_cat))))
  
  dat_atlantic <- dat %>% dplyr::filter(region1 == "Atlantic") %>% dplyr::select(region1, age_cat, female, ResponseId) %>% na.omit()
  dat_quebec <- dat %>% filter(region1 == "Quebec") %>% dplyr::select(region1, age_cat, female, ResponseId) %>% na.omit()
  dat_ontario <- dat %>% filter(region1 == "Ontario") %>% dplyr::select(region1, age_cat, female, ResponseId) %>% na.omit()
  dat_west <- dat %>% filter(region1 == "West") %>% dplyr::select(region1, age_cat, female, ResponseId) %>% na.omit()
  
  ATL.svy.unweighted <- svydesign(ids=~1, data=dat_atlantic, weights = NULL)
  QC.svy.unweighted <- svydesign(ids=~1, data=dat_quebec, weights = NULL)
  ON.svy.unweighted <- svydesign(ids=~1, data=dat_ontario, weights = NULL)
  WE.svy.unweighted <- svydesign(ids=~1, data=dat_west, weights = NULL)
  
  # Weights creation ---------------------------------------------------#
  # Marginal probabilities for the variables that we want to weight ....#
  # Gender .............................................................#
  # Atlantic
  ATL.gender.dist <- tibble(female = c("0", "1"), 
                            Freq = nrow(dat_atlantic)*c(0.481, 0.519))
  
  #Qu?bec
  QC.gender.dist <- tibble(female = c("0", "1"),
                           Freq = nrow(dat_quebec) * c(0.487, 0.513))
  
  # Ontario
  ON.gender.dist <- tibble(female = c("0", "1"),
                           Freq = nrow(dat_ontario) * c(0.481, 0.519))
  
  # West
  WE.gender.dist <- tibble(female = c("0", "1"),
                           Freq = nrow(dat_west) * c(0.491, 0.509))
  
  # Age ................................................................#
  # Atlantic
  ATL.age.dist <- tibble(age_cat = c("1", "2", "3"),
                         Freq = nrow(dat_atlantic) * c(0.232, 0.332, 0.436))
  
  #Qu?bec
  QC.age.dist <- tibble(age_cat = c("1", "2", "3"),
                        Freq = nrow(dat_quebec) * c(0.256, 0.334, 0.410))
  
  # Ontario
  ON.age.dist <- tibble(age_cat = c("1", "2", "3"),
                        Freq = nrow(dat_ontario) * c(0.275, 0.345, 0.380))
  
  # West
  WE.age.dist <- tibble(age_cat = c("1", "2", "3"),
                        Freq = nrow(dat_west) * c(0.291, 0.343, 0.366))
  
  # weight the current data by the population values -------------------#
  rake_atlantic <- rake(design = ATL.svy.unweighted,
                        sample.margins = list(~female, ~age_cat),
                        population.margins = list(ATL.gender.dist, ATL.age.dist))
  
  rake_atlantic <- trimWeights(rake_atlantic, lower=0.1, upper=8, strict=TRUE)
  
  dat_atlantic$wt <- weights(rake_atlantic)
  
  rake_quebec <- rake(design = QC.svy.unweighted,
                      sample.margins = list(~female, ~age_cat),
                      population.margins = list(QC.gender.dist, QC.age.dist))
  
  rake_quebec <- trimWeights(rake_quebec, lower=0.1, upper=8, strict=TRUE)
  
  dat_quebec$wt <- weights(rake_quebec)
  
  rake_ontario <- rake(design = ON.svy.unweighted,
                       sample.margins = list(~female, ~age_cat),
                       population.margins = list(ON.gender.dist, ON.age.dist))
  
  rake_ontario <- trimWeights(rake_ontario, lower=0.1, upper=8, strict=TRUE)
  
  dat_ontario$wt <- weights(rake_ontario)
  
  rake_west <- rake(design = WE.svy.unweighted,
                    sample.margins = list(~female, ~age_cat),
                    population.margins = list(WE.gender.dist, WE.age.dist))
  
  rake_west <- trimWeights(rake_west, lower=0.1, upper=8, strict=TRUE)
  
  dat_west$wt <- weights(rake_west)
  
  # Creating the new dataset with weights ------------------------------#
  Dat_weights <- rbind(dat_atlantic, dat_ontario, dat_quebec, dat_west)
  
  # Adjust for regional imbalance
  # Define target population proportions for each region (these should be real population values)
  region_pop_props <- c(Atlantic = 0.07692, Quebec = 0.24038, Ontario = 0.38365, West = 0.31346)  
  region_sample_props <- Dat_weights %>% count(region1) %>% mutate(prop = n / sum(n))
  region_adj <- region_pop_props[region_sample_props$region1] / region_sample_props$prop
  
  # Apply regional weight adjustment
  Dat_weights <- Dat_weights %>%
    left_join(region_sample_props %>% mutate(region_wt = region_adj), by = "region1") %>%
    mutate(wt_region = wt * region_wt)
  
  # Merge final weights back with original data
  data_weighted <- dat %>%
    left_join(Dat_weights %>% select(ResponseId, wt, wt_region), by = "ResponseId") %>%
    mutate(wt = ifelse(is.na(wt), 1, wt),
           wt_region = ifelse(is.na(wt_region), 1, wt_region))
  
  return(data_weighted %>% select(ResponseId, wt, wt_region))
}

wt_pre <- weighting(dat_pre)
wt_post <- weighting(dat_post)

# Merge data
dat_pre <- right_join(dat_pre, wt_pre, by = "ResponseId")
dat_post <- right_join(dat_post, wt_post, by = "ResponseId")
colnames(dat_pre) <- paste0(colnames(dat_pre), "_pre")
colnames(dat_post) <- paste0(colnames(dat_post), "_post")
dat_pre$PID <- dat_pre$PID_pre
dat_post$PID <- dat_post$PID_post
cemp <- left_join(dat_pre, dat_post, by = "PID")
nrow(cemp)

# Recode variables
cemp <- cemp %>% 
  mutate(
    female_pre = case_when(
      gender_pre == 2 ~ 1, 
      gender_pre == 1 ~ 0, 
      TRUE ~ NA_real_
    ),
    educ_cat3_pre = case_when(
      education_pre %in% c(1:6) ~ 0, 
      education_pre %in% c(7:8) ~ 0.5,
      education_pre %in% c(9:11) ~ 1, 
      TRUE ~ NA_real_
    ), 
    province1_pre = case_when(
      province_pre == 9 ~ "Alberta",
      province_pre == 10 ~ "British Columbia",
      province_pre == 7 ~ "Manitoba",
      province_pre %in% c(1:4) ~ "Atlantic",
      province_pre == 6 ~ "Ontario",
      province_pre == 5 ~ "Quebec",
      province_pre == 8 ~ "Saskatchewan",
      TRUE ~ "Territories"
    ))

# Given that this is an omnibus survey, we have selected the variables we used for the analyses. You can contact the authors if you are interested in other variables named in the codebook.
# Create a vector of all relevant variable names
vars_to_keep <- c(
# Identification
  "PID",
# Completion time
 "Duration (in seconds)_pre", "Duration (in seconds)_post",
# Attention check
  "issues_8_pre", "misinfo_attitudes_6_post", 
# Trust and perceptions
 "trust_accuracy_2_post", "fair_election_pre", "trust_accuracy_3_post", "fair_election_post",
# Mail-in ballot attitudes
 "mail_vote_pre", "mail_vote_post",
# Media habits
  "freq_news_pre", "freq_socmed_pre",
 "media_list_16_pre", "media_list_20_pre", "media_list_22_pre",
# Political orientations
  "party_rating_1_pre", "yesvote_post", "ideol_1_pre",
# Controls
 "interest_1_pre", "age_pre", "educ_cat3_pre", "female_pre", "province1_pre",
# Survey weights
  "wt_region_pre", "wt_region_post"
)

# Subset the data to keep only relevant variables
cemp <- cemp %>% select(all_of(vars_to_keep))

# Export output
write_csv(cemp, file="cemp.csv")